This Rmarkdown will perform some comparisons against the carbon usage stats generate as part of DRAM/GROW to various carbon metrics, namely compound classes. We’ll be (tentatively) using the S19S data published in Garayburu-Caruso et al., 2020. This was generated by aligning the surface water and sediment samples together, so we’ll need to trim the sediment samples out.

A secondary function of this Rmarkdown is to generate summary files based off of the FTICR-MS data so that additional correlations can be performed by others.

setup

Loading in packages, setting options, and such

# options
merge.reps = F

# organizational packages
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggpubr)
library(ggthemes)
library(ggrepel)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library(Rfast)
## Loading required package: Rcpp
## Loading required package: RcppZiggurat
## Loading required package: RcppParallel
## 
## Attaching package: 'RcppParallel'
## 
## The following object is masked from 'package:Rcpp':
## 
##     LdFlags
## 
## 
## Rfast: 2.0.9
##  ___ __ __ __ __    __ __ __ __ __ _             _               __ __ __ __ __     __ __ __ __ __ __   
## |  __ __ __ __  |  |  __ __ __ __ _/            / \             |  __ __ __ __ /   /__ __ _   _ __ __\  
## | |           | |  | |                         / _ \            | |                        / /          
## | |           | |  | |                        / / \ \           | |                       / /          
## | |           | |  | |                       / /   \ \          | |                      / /          
## | |__ __ __ __| |  | |__ __ __ __           / /     \ \         | |__ __ __ __ _        / /__/\          
## |    __ __ __ __|  |  __ __ __ __|         / /__ _ __\ \        |_ __ __ __ _   |      / ___  /           
## |   \              | |                    / _ _ _ _ _ _ \                     | |      \/  / /       
## | |\ \             | |                   / /           \ \                    | |         / /          
## | | \ \            | |                  / /             \ \                   | |        / /          
## | |  \ \           | |                 / /               \ \                  | |       / /          
## | |   \ \__ __ _   | |                / /                 \ \     _ __ __ __ _| |      / /          
## |_|    \__ __ __\  |_|               /_/                   \_\   /_ __ __ __ ___|      \/             team
## 
## Attaching package: 'Rfast'
## 
## The following object is masked from 'package:dplyr':
## 
##     nth
## 
## The following objects are masked from 'package:purrr':
## 
##     is_integer, transpose
# analytical
library(Hmisc)
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# functions
johnston_set = function(mol){
  # Johnston SE et al., 2021 - Biogeochemistry
  # https://link.springer.com/article/10.1007/s10533-021-00876-7#Sec2
  
  # empty object
  johnston_set = rep(NA, nrow(mol))
  
  # boundaries
  johnston_set[which(mol$AI_Mod < 0.5 & 
                       mol$HtoC_ratio < 1.5 & 
                       mol$OtoC_ratio < 0.5)] = "Highly Unsaturated and Phenolic, low O/C"
  johnston_set[which(mol$AI_Mod < 0.5 & 
                       mol$HtoC_ratio < 1.5 & 
                       mol$OtoC_ratio >= 0.5)] = "Highly Unsaturated and Phenolic, high O/C"
  johnston_set[which(mol$HtoC_ratio >= 1.5, mol$N == 0)] = "Aliphatic"
  johnston_set[which(mol$AI_Mod >= 0.67)] = "Condensed Aromatic"
  johnston_set[which(mol$AI_Mod > 0.5 & mol$AI_Mod < 0.67)] = "Polyphenolic"
  johnston_set[which(mol$HtoC_ratio >= 1.5, mol$N > 0)] = "Peptide-like"
  johnston_set[which(mol$OtoC_ratio > 0.9)] = "Sugar-like"
  
  # write out
  return(johnston_set)
  
} # could be case_when, but nervous

roebuck_set = function(mol){
  # Roebuck JA et al., 2022 - Geophyical Research Letters
  # https://agupubs.onlinelibrary.wiley.com/doi/full/10.1029/2022GL099535
  # modified from Seidel et al., 2015 - Marine Chemistry
  # https://www.sciencedirect.com/science/article/pii/S0304420315300049
  
  # empty object
  roebuck_set = rep(NA, nrow(mol))
  
  # boundaries
  roebuck_set[which(mol$AI_Mod >= 0.67 & mol$C >= 15)] = "Polycyclic Aromatic"
  roebuck_set[which(mol$AI_Mod >= 0.67 & mol$C >= 15 & 
                      (mol$N > 0 | mol$S > 0 | mol$P > 0))] = "Polycyclic Aromatic w/ Heteroatoms"
  roebuck_set[which(mol$AI_Mod >= 0.67 & mol$C < 15 &
                      (mol$N > 0 | mol$S > 0 | mol$P > 0))] = "High Aromaticity (<15C)"
  roebuck_set[which(mol$AI_Mod >= 0.5 & mol$AI_Mod < 0.67)] = "Highly Aromatic including Polyphenols"
  roebuck_set[which(mol$AI_Mod >= 0.5 & mol$AI_Mod < 0.67 &
                      (mol$N > 0 | mol$S > 0 | mol$P > 0))] = "Highly Aromatic w/ Heteroatoms"
  roebuck_set[which(mol$AI_Mod < 0.5 & mol$HtoC_ratio < 1.5)] = "Highly Unsaturated"
  roebuck_set[which(mol$AI_Mod < 0.5 & mol$HtoC_ratio < 1.5 &
                      (mol$N > 0 | mol$S > 0 | mol$P > 0))] = "Highly Unsaturated w/ Heteroatoms"
  roebuck_set[which(mol$HtoC_ratio >= 1.5 & mol$HtoC_ratio < 2)] = "Unsaturated Aliphatics"
  roebuck_set[which(mol$HtoC_ratio >= 1.5 & mol$HtoC_ratio < 2 &
                      (mol$N > 0 | mol$S > 0 | mol$P > 0))] = "Unsaturated Aliphatics w/ Heteroatoms (including peptides)"
  roebuck_set[which(mol$HtoC_ratio >= 2 & mol$OtoC_ratio < 0.9)] = "Saturated Compounds (including lipids)"
  roebuck_set[which(mol$HtoC_ratio >= 2 & mol$OtoC_ratio < 0.9 &
                      (mol$N > 0 | mol$S > 0 | mol$P > 0))] = "Saturated Compounds w/ Heteroatoms"
  roebuck_set[which(mol$HtoC_ratio >= 2 & mol$OtoC_ratio >= 0.9)] = "Saturated Compounds (including carbohydrates)"
  roebuck_set[which(mol$mol$HtoC_ratio >= 2 & mol$OtoC_ratio >= 0.9 &
                      (mol$N > 0 | mol$S > 0 | mol$P > 0))] = "Carbohydrate-like w/ Heteroatoms (including amino sugars)"
  
  # write out
  return(roebuck_set)
}

# define boundary set plot function
summary_plot = function(summary, title){
  summary %>% gather(Sample, Count, -Class) %>%
    group_by(Sample) %>%
    mutate(`Relative Count (%)` = (Count/sum(Count))*100) %>%
    ungroup() %>%
    left_join(carbon.use, by = c("Sample" = "code"), 
              relationship = "many-to-many") %>%
    filter(!is.na(abund)) %>%
    mutate(abund = abund*100) %>%
    rename(`Relative Expression (%)` = abund) %>%
    ggplot(aes(x = `Relative Expression (%)`, y = `Relative Count (%)`))+
    geom_point(aes(color = Guild))+
    geom_smooth(method = "lm")+
    scale_color_manual(values = color)+
    # stat_cor(method = "spearman")+
    facet_grid(Class~Guild, scales = "free", labeller = label_wrap_gen(width=15))+
    ggtitle(title)+
    theme_bw()+
    theme(legend.position = "none")
}

# define boundary set correlation function
summary_stat = function(summary){
  # merge summary to carbon use
  unique.class = unique(summary$Class)
  unique.guild = unique(carbon.use$Guild)
  
  # empty object
  correlations = NULL
  
  # looping through pairwise comparisons
  for(i in 1:length(unique.class)){
    for(j in 1:length(unique.guild)){
      # select current comparisons
      curr.class = unique.class[i]
      curr.guild = unique.guild[j]
      
      # create temporary object
      temp = summary %>% gather(code, Count, -Class) %>%
        group_by(code) %>% mutate(Rel_Count = (Count/sum(Count)*100)) %>%
        filter(Class == curr.class) %>%
        left_join(carbon.use[which(carbon.use$Guild %in% curr.guild),], by = "code") %>%
        filter(!is.na(Guild)) %>%
        mutate(abund = abund*100)
      
      # run spearman correlation
      temp = rcorr(temp$Rel_Count, temp$abund)
      
      # build output
      temp = data.frame(Class = curr.class, Guild = curr.guild, 
                        rho = temp$r[1,2], p_value = temp$P[1,2])
      
      # add to correlation object
      correlations = rbind(correlations, temp)
      
    }
  }
  
  # return correlations
  return(correlations)
}

# import lambda functions
source("~/Documents/Code Information/FTICR Scripts/R Functions/getLambda.R")

# define colors
color = c(`chemolithoautotroph` = "#8CADD2",
          `heterotroph-aromatic` = "#198035",
          `heterotroph-polymer` = "#093614",
          `heterotroph-sugar` = "#1F7C37",
          `methyl_c1` = "#7D52A7",
          `SCFA  and alcohol conversions` = "#7A82BA",
          `heterotroph-ch4` = "#72006D")
color = color[order(names(color))]

# knitr options
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = "~/Documents/PNNL Analyses/GROW Paper/")

loading

Handles the import of data

# set working directory (redunant)
setwd( "~/Documents/PNNL Analyses/GROW Paper/")

# load in ICR data
if(merge.reps){ # load in unmerged data if merge.data == T
  data = read.csv("Processed_S19S_Sed-Wat_8.12_Data.csv", row.names = 1)
  mol = read.csv("Processed_S19S_Sed-Wat_8.12_Mol.csv", row.names = 1)
} else { # load in merged data
  data = read.csv("Merged_S19S_Wat_8.12_Data.csv", row.names = 1)
  mol = read.csv("Merged_S19S_Wat_8.12_Mol.csv", row.names = 1)
}

# load in transformation data
if(merge.reps){ # if the data isn't merged, there is no transformation profile to load
  print("Transformation data has not been generated yet.")
} else{ # loading in transformation data
  trans = read.csv("Merged_S19S_Wat_8.12_Trans_Profiles.csv", row.names = 1)
}

# load in carbon use data
carbon.use = read_xlsx("carbon_usage.xlsx")

# load in metadata
metadata = read_xlsx("Table S1 - S19S Metadata.xlsx")

# confirm matching data and mol
if(!identical(row.names(data), row.names(mol))){
  stop("The FTICR-MS data objects do not having matching masses.")
}

Merge reps (if necessary)

This section handles the merging of reps - a peak must be present in at least two out of three reps for a sample before it is considered “present” in the sample. Not only does this improve the confidence of the data, it also removes noise.

This section is also dropping the sediment data from the data object.

# merge replicates
if(merge.reps){ # checking to see if the merge flag is active
  # list files to see if merging already happened (computational insurance)
  file.list = list.files(pattern = "Merged")
  
  if(length(file.list) > 0){
    stop("You've already merged the data, so I'm saving you some time (even though you are just me who made a mistake).")
  }
  
  # removing sediment samples
  data = data[,-grep("Sed_Field", colnames(data))]
  
  # removing peaks present only in sediment
  mol = mol[-which(rowSums(data) == 0),]
  data = data[-which(rowSums(data) == 0),]
  
  # identify samples
  samp = metadata$ID
  
  # create object to store data
  mean.data = as.data.frame(matrix(data = NA, 
                                   nrow = nrow(data), 
                                   ncol = length(samp),
                                   dimnames = list(row.names(data),
                                                   samp)))
  # loop through samples and merge
  for(curr.col in colnames(mean.data)){
    
    # find samples
    temp = as.data.frame(data[,grep(paste0(curr.col, "_"), colnames(data))])
    
    # merge replicates, checking to ensure replicates exist...
    if(ncol(temp) == 1){
      temp[temp > 1] = 0  # If there is only one replicate for a sample, 
      # I'm setting intensities to zero to so it gets 
      # filtered
    } else {
      w = which(apply(temp, 1, function(x) length(which(x > 0))) >= 2) # find peaks with values in two reps
      temp[-w,] = 0 # set intensities to 0 if peaks missing in >2 reps
    }
    
    # average values together and fill in empty object
    mean.data[,curr.col] = apply(temp, 1, mean, na.rm = T)
    
    # clean up
    rm(temp, w)
  }
  
  # S19S_0001 is missing, dropping it
  mean.data = mean.data[,!is.na(mean.data[1,])]
  
  # setting mean.data to data
  data = mean.data
  rm(mean.data)
  
  # remove peaks dropped during merging
  mol = mol[-which(rowSums(data) == 0),]
  data = data[-which(rowSums(data) == 0),]
  
  # writing data out
  write.csv(data, "Merged_S19S_Wat_8.12_Data.csv", quote = T)
  write.csv(mol, "Merged_S19S_Wat_8.12_Mol.csv", quote = T)
  
}

Generating summary metrics

Summarizes our FTICR-MS data by various boundary sets and some cheminformatic derived variables (e.g., NOSC, AI-Mod, etc.).

### preprocessing
# calculating new boundary sets
mol$bs_johnston = johnston_set(mol)
mol$bs_roebuck = roebuck_set(mol)

# select only assigned formulas
data = data[!is.na(mol$MolForm),]
mol = mol[!is.na(mol$MolForm),]

# calculate lambda
get_comp = get_compositions(mol) # lambda requires specific elcomp formulation
lambda = as.data.frame(get_lambda(get_comp$chemical_compositions)) # calc lambda

# name lambda object
names <- rep("", 62)
names[1:12] <- c("delGcox0","delGd0","delGcat0","delGan0","delGdis0","lambda0",
                 "delGcox","delGd","delGcat","delGan","delGdis","lambda")
stoich_colnames <- c("donor","h2o","hco3","nh4","hpo4","hs","h","e","acceptor","biom")
stoich_types <- c("stoichD","stoichA","stoichCat","stoichAn","stoichMet")

for (i in 1:length(stoich_types)) {
  names[((i-1)*10+13):(i*10+12)] <- array(sapply(stoich_types[i], paste, stoich_colnames, sep="_"))
}

colnames(lambda) <- names

# rearrange lambda
lambda['MolForm'] <- get_comp$formulas
lambda = as.data.frame(lambda[,c("MolForm", "delGcox0", "delGcox", "lambda0", "lambda", "delGd0", "delGd")])
colnames(lambda) = c("MolForm","delGcox0PerCmol","delGcoxPerCmol", "lamO20","lamO2","delGd0","delGd")
lambda = lambda[!duplicated(lambda$MolForm),]

# add lamba to mol
mol = mol %>% left_join(lambda, by = "MolForm")

# cleanup
rm(names, stoich_colnames, stoich_types, get_comp, lambda)

### summarize metrics by average
# create object
metric_summary = data.frame(Sample = colnames(data), AI_Mod = NA, DBE = NA, 
                            NOSC = NA, lambda = NA, HtoC = NA, OtoC = NA, 
                            NtoC = NA, PtoC = NA, NtoP = NA)

# loop through samples and average the metrics
for(i in 1:ncol(data)){
  # select formulas from sample
  w = which(data[,i] > 0)
  
  # temporary mol
  temp = mol[w,]
  
  # average values
  metric_summary$AI_Mod[i] = mean(temp$AI_Mod)
  metric_summary$DBE[i] = mean(temp$DBE)
  metric_summary$NOSC[i] = mean(temp$NOSC)
  metric_summary$lambda[i] = mean(temp$lamO2)
  metric_summary$HtoC[i] = mean(temp$HtoC_ratio)
  metric_summary$OtoC[i] = mean(temp$OtoC_ratio)
  metric_summary$NtoC[i] = mean(temp$NtoC_ratio)
  metric_summary$PtoC[i] = mean(temp$PtoC_ratio)
  metric_summary$NtoP[i] = mean(temp$NtoP_ratio, na.rm = T)
  
}

# add in molecular formula richness
metric_summary = metric_summary %>%
  left_join(data.frame(Sample = colnames(data),
                       Richness = apply(data, 2, function(x) length(which(x > 0)))),
            by = "Sample")


### summarize by Bailey set 
# https://www.sciencedirect.com/science/article/pii/S0038071716306447#sec2

# create empty object
bailey_set_summary = data.frame(Class = unique(mol$bs2_class))

# loop through samples, counting classes
for(i in 1:ncol(data)){
  # select formulas from sample
  w = which(data[,i] > 0)
  
  # count classes of detected formulas
  temp = data.frame(table(mol$bs2_class[w]))
  
  # adjust column names
  colnames(temp) = c("Class", colnames(data)[i])
  
  # add counts to previously created object
  bailey_set_summary = bailey_set_summary %>%
    left_join(temp, by = c("Class"))
  
}

### summarize by Johnston set
# create empty object
johnston_set_summary = data.frame(Class = unique(mol$bs_johnston))

# loop through samples, counting classes
for(i in 1:ncol(data)){
  # select formulas from sample
  w = which(data[,i] > 0)
  
  # count classes of detected formulas
  temp = data.frame(table(mol$bs_johnston[w]))
  
  # adjust column names
  colnames(temp) = c("Class", colnames(data)[i])
  
  # add counts to previously created object
  johnston_set_summary = johnston_set_summary %>%
    left_join(temp, by = c("Class"))
  
}

### summarize by Roebuck set
# create empty object
roebuck_set_summary = data.frame(Class = unique(mol$bs_roebuck))

# loop through samples, counting classes
for(i in 1:ncol(data)){
  # select formulas from sample
  w = which(data[,i] > 0)
  
  # count classes of detected formulas
  temp = data.frame(table(mol$bs_roebuck[w]))
  
  # adjust column names
  colnames(temp) = c("Class", colnames(data)[i])
  
  # add counts to previously created object
  roebuck_set_summary = roebuck_set_summary %>%
    left_join(temp, by = c("Class"))
  
}

rm(w,i)

# cleaning up
bailey_set_summary = bailey_set_summary[!is.na(bailey_set_summary$Class),]
johnston_set_summary = johnston_set_summary[!is.na(johnston_set_summary$Class),]
roebuck_set_summary = roebuck_set_summary[!is.na(roebuck_set_summary$Class),]

# writing summaries
write.csv(metric_summary, "Summaries/Merged_S19S_Wat_8.12_Metric-Summary.csv",
          quote = F, row.names = F)
write.csv(bailey_set_summary, "Summaries/Merged_S19S_Wat_8.12_Bailey-Summary.csv",  
          quote = F, row.names = F)
write.csv(johnston_set_summary, "Summaries/Merged_S19S_Wat_8.12_Johnston-Summary.csv", 
          quote = F, row.names = F)
write.csv(roebuck_set_summary, "Summaries/Merged_S19S_Wat_8.12_Roebuck-Summary.csv", 
          quote = F, row.names = F)

Relating boundary sets to carbon use

Performing the correlations between the carbon use traits and the various FTICR-MS derived variables.

### Metric summary relationships
# plot
metric_summary %>% gather(Variable, Value, -Sample) %>%
  group_by(Sample) %>%
  left_join(carbon.use, by = c("Sample" = "code"), 
            relationship = "many-to-many") %>%
  filter(!is.na(abund)) %>%
  mutate(abund = abund*100) %>%
  rename(`Relative Expression (%)` = abund) %>%
  ggplot(aes(x = `Relative Expression (%)`, y = Value))+
  geom_point(aes(color = Guild))+
  geom_smooth(method = "lm")+
  scale_color_manual(values = color)+
  facet_grid(Variable~Guild, scales = "free", labeller = label_wrap_gen(width=15))+
  ggtitle("DOM Metric Relationships")+
  theme_bw()+
  theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'

ggsave("Figures/Guild-Metric_Relationships.pdf", height = 12, width = 12)
## `geom_smooth()` using formula = 'y ~ x'
# stats
unique.metric = colnames(metric_summary)
unique.guild = unique(carbon.use$Guild)

# empty object
metric_stats = NULL

# looping through pairwise comparisons
for(i in 2:length(unique.metric)){
  for(j in 1:length(unique.guild)){
    # select current comparisons
    curr.metric = unique.metric[i]
    curr.guild = unique.guild[j]
    
    # create temporary object
    temp = metric_summary %>% 
      gather(Variable, Value, -Sample) %>%
      filter(Variable == curr.metric) %>%
      rename(code = Sample) %>%
      left_join(carbon.use[which(carbon.use$Guild %in% curr.guild),], by = "code") %>%
      filter(!is.na(Guild)) %>%
      mutate(abund = abund*100)
    
    # run spearman correlation
    temp = rcorr(temp$Value, temp$abund)
    
    # build output
    temp = data.frame(Metric = curr.metric, Guild = curr.guild, 
                      rho = temp$r[1,2], p_value = temp$P[1,2])
    
    # add to correlation object
    metric_stats = rbind(metric_stats, temp)
  }
}

rm(temp, i, j, curr.metric, curr.guild, unique.guild, unique.metric)

# adjust p-value
metric_stats$padj = p.adjust(metric_stats$p_value, method = "fdr")

### Bailey relationships
# plot
summary_plot(bailey_set_summary, "Bailey Boundary Set Relationships")
## `geom_smooth()` using formula = 'y ~ x'

ggsave("Figures/Guild-Bailey_Relationships.pdf", height = 12, width = 12)
## `geom_smooth()` using formula = 'y ~ x'
# real stats
bailey_stats = summary_stat(bailey_set_summary)
bailey_stats$padj = p.adjust(bailey_stats$p_value, method = "fdr")


### Johnston relationships
# plot
summary_plot(johnston_set_summary, "Johnston Boundary Set Relationships")
## `geom_smooth()` using formula = 'y ~ x'

ggsave("Figures/Guild-Johnston_Relationships.pdf", height = 12, width = 12)
## `geom_smooth()` using formula = 'y ~ x'
# real stats
johnston_stats = summary_stat(johnston_set_summary)
johnston_stats$padj = p.adjust(johnston_stats$p_value, method = "fdr")


### Roebuck relationships
# plot
summary_plot(roebuck_set_summary, "Roebuck Boundary Set Relationships")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 348 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 348 rows containing missing values (`geom_point()`).

ggsave("Figures/Guild-Roebuck_Relationships.pdf", height = 12, width = 12)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 348 rows containing non-finite values (`stat_smooth()`).
## Removed 348 rows containing missing values (`geom_point()`).
# real stats
roebuck_stats = summary_stat(roebuck_set_summary)
roebuck_stats$padj = p.adjust(roebuck_stats$p_value, method = "fdr")

# write stats
write.csv(metric_stats, "Correlations/Guild-Metric_Relationships.csv",
          quote = T, row.names = F)
write.csv(bailey_stats, "Correlations/Guild-Bailey_Relationships.csv",
          quote = T, row.names = F)
write.csv(johnston_stats, "Correlations/Guild-Johnston_Relationships.csv",
          quote = T, row.names = F)
write.csv(roebuck_stats, "Correlations/Guild-Roebuck_Relationships.csv",
          quote = T, row.names = F)

Relationships between methyl-C1 and transformations

# reorganizing the transformation object
trans = trans[,-1] %>% rownames_to_column("Transformation")
colnames(trans) = gsub("Sample_", "", colnames(trans))

# select methyl guild
methyl_guild = carbon.use[which(carbon.use$Guild %in% "methyl_c1"),]

# empty object
methyl_relation = NULL

# looping through pairwise comparisons
for(i in 1:nrow(trans)){
  # select current comparisons
  curr.trans = trans$Transformation[i]
  
  # create temporary object
  temp = trans %>% 
    gather(code, Count, -Transformation) %>%
    group_by(code) %>% mutate(Rel_Count = (Count/sum(Count)*100)) %>%
    filter(Transformation == curr.trans) %>%
    left_join(methyl_guild, by = "code") %>%
    filter(!is.na(abund)) %>%
    mutate(abund = abund*100)
  
  # run spearman correlation
  temp = rcorr(temp$Rel_Count, temp$abund)
  
  # build output
  temp = data.frame(Transformation = curr.trans, 
                    rho = temp$r[1,2], p_value = temp$P[1,2])
  
  # add to correlation object
  methyl_relation = rbind(methyl_relation, temp)
  
}

# adjust p-value
methyl_relation$padj = p.adjust(methyl_relation$p_value, method = "fdr")

# plotting methyl-C1H4 relationship for direct inspection
trans %>% gather(Sample, Count, -Transformation) %>%
  group_by(Sample) %>%
  mutate(`Relative Count (%)` = (Count/sum(Count))*100) %>%
  filter(Transformation == "C1H4") %>% ungroup() %>%
  gather(Variable, Value, -Transformation, -Sample) %>%
  left_join(methyl_guild, by = c("Sample" = "code")) %>%
  filter(!is.na(abund)) %>% mutate(abund = abund*100) %>%
  ggplot(aes(x = Value, y = abund))+
  geom_point(color = color["methyl_c1"])+
  geom_smooth(method = "lm")+
  facet_grid(.~Variable, scales = "free_x")+
  xlab("Transformation Value")+
  ylab("Relative Expression (%)")+
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

ggsave("Figures/Methyl-C1H4_Relationship.pdf") 
## Saving 7 x 5 in image
## `geom_smooth()` using formula = 'y ~ x'
# write stats
write.csv(methyl_relation, "Correlations/Methyl-Transformations_Relationships.csv")